#################################################
# Clear workspace and set seed
# Empty the global environment, then let R release the freed memory
rm(list = ls())
gc()
# Store the seed under a name and seed the random number generator with it
seed <- 1845
set.seed(seed)



##############################################
# Libraries
# Attach every package named in `toLoad`, installing any that cannot be found.
#   toLoad : character vector of package names
# Returns NULL invisibly; the function is called for its side effects.
loadPkg <- function(toLoad){
  for(lib in toLoad){
    # system.file() returns "" when the package is not installed. This avoids
    # rebuilding the full installed.packages() table on every iteration.
    if(!nzchar(system.file(package=lib))){
      # Packages are fetched over HTTPS rather than plain HTTP
      install.packages(lib, repos='https://cran.rstudio.com/')
    }
    suppressMessages( library(lib, character.only=TRUE) )
  }
  invisible(NULL)
}

# Packages attached by this script, in the order listed. A package attached
# later masks same-named functions of the packages attached before it.
# ('foreach' was listed twice; it now appears once.)
toLoad <- c(
	'magrittr', 'stringr', 'rjson', 'jsonlite', 'XML', 'xml2', 'htm2txt', 'stringi', 'foreign', 'data.table', 'readstata13', 'haven', # General usage
	'zoo', 'reshape2', 'xtable', 'doBy', 'stargazer', 'reshape', 'MASS', 'car', 'plyr', 'yaml', 'tidyverse', 'qdap', 'janitor', # 'feather',  #More gen usage
	'tm', 'SnowballC', 'RWeka', 'slam', 'tokenizers', 'quanteda', 'stm', 'topicmodels',# text processing packages
	'ggplot2','wordcloud', 'RColorBrewer', 'ggridges', 'egg', 'grid', 'separationplot', 'pROC', 'cowplot', 'plotrix', # viz packages
	'modeltools', 'glmnet', 'plotmo', 'lme4', # modeling 
	'parallel', 'foreach', 'doParallel' #parallelization
	)
loadPkg(toLoad)

# Need older version of tm package
#if(! 'tm' %in% installed.packages()[,1])
#install.packages('https://cran.r-project.org/src/contrib/Archive/tm/tm_0.7.tar.gz', repos=NULL, type="source")

# source("http://bioconductor.org/biocLite.R"); biocLite("RDRToolbox") 
# library(RDRToolbox)

# Make the black-and-white theme the default for ggplot2 plots drawn after this line
ggplot2::theme_set(ggplot2::theme_bw())
##############################################

##############################################
# Helper functions
# Strip leading and trailing whitespace from each element of x
trim <- function (x) gsub("^\\s+|\\s+$", "", x)
# Convert to character
char <- function(x){ as.character(x) }
# Convert to numeric by way of character, so a factor yields the numbers
# spelled by its labels rather than its internal integer codes
num <- function(x){ as.numeric(char(x)) }
# Count the whitespace-separated words in each element of x.
# Splitting on runs of whitespace keeps repeated spaces, tabs and newlines
# from being counted as extra (empty) words; lengths() returns an integer
# vector even when x is empty.
wordCnt <- function(x) { lengths(strsplit( trim(x), "\\s+")) }
# Convert the columns of `data` named in `vars` to numeric and return the
# modified copy. Columns are read and written with [[ ]] so the function also
# works on data-frame classes whose `[` does not drop to a vector (e.g. tibbles).
convNumDcol <- function(data, vars){
  for(var in vars){ data[[var]] <- num(data[[var]]) }
  return( data ) }
# Standardize x: subtract its mean, then divide by its standard deviation.
# Missing values are not removed, so an NA in x propagates to the result.
stdz <- function(x) {
  centered <- x - mean(x)
  centered / sd(x)
}
# Median of x with missing values ignored
medNA <- function(x) {
  stats::median(x, na.rm = TRUE)
}
# Return the elements of `string` that do NOT match `pattern`; matching is
# delegated to str_detect().
# NOTE(review): this is defined at top level under the same name as
# stringr::str_remove(), so unqualified calls made after this line get this
# filtering version -- confirm that is intended.
str_remove <- function(string, pattern) {
  matched <- str_detect(string, pattern)
  string[!matched]
}
# Names that df1 and df2 have in common
comNames <- function(df1, df2) {
  first_names <- names(df1)
  second_names <- names(df2)
  intersect(first_names, second_names)
}
# Elementwise test: TRUE where x leaves no remainder when divided by 2
even <- function(x) {
  remainder <- x %% 2
  remainder == 0
}
# Print the memory footprint of x, formatted in `unit` (default 'Mb').
# Returns the object.size() value invisibly, as print() does.
# print() is called directly rather than through a pipe, so the helper does
# not need magrittr to be attached.
size <- function(x, unit = 'Mb'){ print(object.size(x), units = unit) }
# Count the missing (NA) entries of x.
# Summing the logical mask gives the same integer count as
# length(which(...)) without building the index vector or needing a pipe.
nas <- function(x){ sum(is.na(x)) }
# Class of every column of df, named by column.
# Columns are extracted with [[ ]] so each column's own class is reported even
# for data-frame classes whose `[` returns a data frame instead of a vector
# (e.g. tibbles).
# sapply() simplifies the result: a named character vector when every column
# has exactly one class; otherwise the shape is whatever sapply() produces.
classes <- function(df){sapply(names(df), function(name){class(df[[name]])})}
# Size of x: length() for a vector (lists included), dim() for a data frame
# or matrix. Anything else is rejected with an error.
d <- function(x){
  # is.vector() is FALSE for objects carrying attributes other than names
  # (data frames, matrices, factors), so this branch only takes plain vectors.
  if(is.vector(x)){
    return(length(x))
  }
  # Scalar tests inside if(): use short-circuiting || instead of elementwise
  # operators.
  if(is.data.frame(x) || is.matrix(x)){
    return(dim(x))
  }
  stop('Requires data frame, matrix, or vector (which includes lists).')
}

##############################################

##############################################
# Other functions that are used in multiple scripts

# Function to remove commonly occurring terms from a DTM
#   x   : a DocumentTermMatrix or TermDocumentMatrix
#   pct : a single number strictly between 0 and 1
# A term is kept only when the number of stored entries in its row is below
# pct * (number of columns) of the terms-in-rows matrix -- presumably one stored
# entry per document containing the term, as in a sparse triplet matrix.
# Returns x restricted to the kept terms, in the same orientation as x.
removeCommonTerms <- function (x, pct) 
{
    stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")), 
              is.numeric(pct), pct > 0, pct < 1)
    isDtm <- inherits(x, "DocumentTermMatrix")
    # Work on the transposed matrix when x is a DocumentTermMatrix
    byTerm <- if (isDtm) t(x) else x
    # Tabulate the row index of every stored entry, then flag the rows whose
    # count falls below the threshold
    entryCount <- table(byTerm$i)
    isRare <- entryCount < byTerm$ncol * (pct)
    termIndex <- as.numeric(names(isRare[isRare]))
    if (isDtm) {
        return(x[, termIndex])
    }
    x[termIndex, ]
}

# Stemming function for a character vector
#   x       : character vector (or something tolower() can coerce); each element
#             is treated as one document
#   toLower : lower-case x before stemming (default TRUE)
# Returns a character vector holding the stemmed text of the documents.
stemVector <- function(x, toLower=TRUE){
    if(toLower){ x <- tolower(x) }
    # Build a VCorpus explicitly so every document is a separate text document.
    # NOTE(review): Corpus() may return a SimpleCorpus for a VectorSource in
    # newer tm versions, whose unlist() names are not 'content.content' --
    # confirm against the tm version in use.
    xStem <- tm_map(VCorpus(VectorSource(x)), stemDocument)
    # Read each document's text through as.character() instead of picking
    # entries out of unlist() by name, which depends on the corpus' internals
    xStem <- unlist(lapply(xStem, as.character), use.names=FALSE)
    return( char(xStem) )
}

##############################################

###############################################
### Set list of stopwords

# From DDE
# Base stopword list as a character vector. Some entries appear more than once
# here; the combined list built further down is passed through unique().
# NOTE(review): the last three entries ('si...', 'S6301I76...08',
# 'Sa...AE1nchezK') look like text damaged by an encoding round-trip -- confirm
# they still match tokens in the data before relying on them.
stoplist <- c('subclause', 'clause', 'subparagraphs', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
              'saturday', 'sunday', 'print', 'printing', 'refer', 'referred', 'referring', 'january', 'february',
              'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december',
              'provided', 'relating', 'subtitle', 'committees', 'additional', 'session', 'deleted', 'subject',
              'title', 'parts', 'chapter', 'subchapter', 'days', 'introduced', 'adding', 'relating', 'necessary',
              'later', 'pursuant', 'years', 'year', 'subsections', 'section', 'sections', 'subparagraph',
              'paragraph', 'including', 'congress', 'date', 'general', 'inserting', 'title', 'subsection',
              'shall', 'secretary', 'united', 'states', 'title', 'other', 'inserting', 'striking', 'following',
              'term', 'definition', 'definitions', 'means', 'term', 'division', 'section', 'about', 'above',
              'across', 'after', 'afterwards', 'again', 'against', 'almost', 'alone', 'along', 'already', 'also',
              'although', 'always', 'among', 'amongst', 'amoungst', 'amount', 'and', 'another', 'any', 'anyhow',
              'anyone', 'anything', 'anyway', 'anywhere', 'around', 'back', 'became', 'because', 'become', 'becomes',
              'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between',
              'beyond', 'bill', 'both', 'bottom', 'call', 'cannot', 'cant', 'could', 'couldnt', 'describe', 'described',
              'detail', 'done', 'down', 'during', 'each', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty',
              'enough', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'fifteen', 'fifty',
              'fill', 'find', 'fire', 'first', 'five', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front',
              'full', 'further', 'give', 'hasnt', 'have', 'hence', 'here', 'hereafter', 'hereby', 'herein', 'hereupon',
              'hers', 'herself', 'himself', 'however', 'hundred', 'indeed', 'interest', 'into', 'itself', 'keep', 'last',
              'latter', 'latterly', 'least', 'less', 'made', 'many', 'may', 'meanwhile', 'might', 'mill', 'mine', 'more',
              'moreover', 'most', 'mostly', 'move', 'much', 'must', 'myself', 'name', 'namely', 'neither', 'never',
              'nevertheless', 'next', 'nine', 'nobody', 'none', 'noone', 'nothing', 'nowhere', 'often', 'once', 'only',
              'onto', 'there', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'over', 'part', 'perhaps', 'please',
              'rather', 'same', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'should', 'show', 'side',
              'since', 'sincere', 'sixty', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes',
              'somewhere', 'still', 'such', 'system', 'take', 'than', 'that', 'their', 'them', 'themselves', 'then',
              'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they',
              'thick', 'thin', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus',
              'together', 'toward', 'towards', 'twelve', 'twenty', 'under', 'until', 'upon', 'very', 'well', 'were',
              'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein',
              'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'whoever', 'whole', 'whom', 'whose',
              'will', 'with', 'within', 'without', 'would', 'your', 'yours', 'yourself', 'yourselves', 'deleted', 'si���',
              'S6301I76ï¿½08', 'Saï¿½AE1nchezK')

# Native stopwords: tm's 'en' list joined with the 'english' list returned by
# whichever stopwords() is found first on the search path, without duplicates
native_stopwords <- unique(c(tm::stopwords('en'), stopwords('english')))

# Stopwords from Barium
# Literal terms followed by the stemmed US state names, state regions, month
# names and weekday names; duplicates are removed at the end.
# NOTE(review): 'ammendment' looks like a misspelling of 'amendment'; it is
# kept as written because changing the list changes what gets filtered.
data(state)
specWords <- c(
  'mr', 'ms', 'speaker', 'gentleman', 'gentlewoman',
  'america', 'american', 'state', 'countri', 'nation',
  'federal', 'senate', 'house', 'congress', 'joint',
  'represent', 'honor', 'people', 'year', 'time',
  'committee', 'chamber', 'member', 'chairman',
  'chairwoman', 'time', 'yield', 'ask',
  'will', 'want', 'can', 'get', 'just', 'ratify',
  'know', 'now', 'new', 'old', 'program', 'provide',
  'bill', 'legislation', 'say', 'support', 'year', 'make',
  'without', 'object', 'order', 'recognize', 'chair', 'order',
  'unanimous', 'consent', 'order', 'second', 'yea', 'aye', 'nay',
  'h', 'res', 'clerk', 'call', 'roll', 'quorum', 'call',
  'minute', 'minutes', 'motion', 'proceed', 'also', 'pass', 'continue',
  'colleague', 'colleagues', 'score', 'cbo', 'number', 'floor', 'resolution',
  'conference', 'report', 'rollcall', 'thank', 'distinguish',
  'reserve', 'balance', 'offer', 'ammendment', 'print',
  'record', 'follow', 'material', 'govern', 'account', 'office',
  'text', 'print', 'record', 'rise', 'today', 'preside',
  'let', 'us', 'side', 'aisle', 'thing', 'one', 'inform',
  'please', 'contact', 'madame', 'consume', 'line', 'item',
  'talk', 'little', 'bit', 'amdt', 'usc', 'resolv', 'hr',
  stemVector(state.name),
  stemVector(unique(state.region)),
  stemVector(month.name),
  # Weekday names are taken from the weekdays of the built-in leap-second
  # dates; weekdays() output depends on the locale -- TODO confirm all seven
  # days are produced in the environment this runs in.
  stemVector(unique(weekdays(.leap.seconds)))
) %>% unique()

# Combine stopword lists: append the special-purpose and native lists to the
# base list and keep the first occurrence of each word
stoplist <- unique(c(stoplist, specWords, native_stopwords))

# List of post-stemming stopwords (single letters and stray roman numerals, plus misc words)
# The numerals ii..xxx are generated so that none can be skipped or repeated by
# a typo ('viii' and 'xiii' are included). 'v' and 'x' also occur in `letters`;
# unique() keeps a single copy of each.
final_stoplist <- unique(c(letters,
                           tolower(as.character(utils::as.roman(2:30))),
                           'th', 'st', 'act', 'usc', 'up', 'all', 'so', 'sec', 'establish',
                           'le'))
